{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "frTMl3sShA3P"
   },
   "outputs": [],
   "source": [
    "from __future__ import absolute_import\n",
    "from __future__ import division\n",
    "from __future__ import print_function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     },
     "height": 321,
     "output_extras": [
      {
       "item_id": 1
      }
     ]
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 2880,
     "status": "error",
     "timestamp": 1505781339378,
     "user": {
      "displayName": "Sara Robinson",
      "photoUrl": "//lh4.googleusercontent.com/-RR9n0dvbwgI/AAAAAAAAAAI/AAAAAAAAMYM/SOr5ZExpvXE/s50-c-k-no/photo.jpg",
      "userId": "112510032804989247452"
     },
     "user_tz": 240
    },
    "id": "783h64rGhA3T",
    "outputId": "d447b2ab-e321-4ee5-abd4-de2c0116302f"
   },
   "outputs": [],
   "source": [
    "import itertools\n",
    "import os\n",
    "import math\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import tensorflow as tf\n",
    "\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "from tensorflow import keras\n",
    "from keras.models import Model\n",
    "import keras.layers as layers\n",
    "\n",
    "# This code was tested with TensorFlow v1.5\n",
    "print(\"You have TensorFlow version\", tf.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "c7te21f7hA3V"
   },
   "outputs": [],
   "source": [
    "data = pd.read_csv(\"winemag-data_first150k.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     },
     "output_extras": [
      {}
     ]
    },
    "colab_type": "code",
    "id": "chAbp3ryhA3X",
    "outputId": "3a16921d-a3be-4c43-a5e2-6012220ee13a"
   },
   "outputs": [],
   "source": [
    "# Shuffle the data\n",
    "data = data.sample(frac=1)\n",
    "\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Do some preprocessing to limit the # of wine varities in the dataset\n",
    "data = data[pd.notnull(data['country'])]\n",
    "data = data[pd.notnull(data['price'])]\n",
    "data = data.drop(data.columns[0], axis=1) \n",
    "\n",
    "variety_threshold = 500 # Anything that occurs less than this will be removed.\n",
    "value_counts = data['variety'].value_counts()\n",
    "to_remove = value_counts[value_counts <= variety_threshold].index\n",
    "data.replace(to_remove, np.nan, inplace=True)\n",
    "data = data[pd.notnull(data['variety'])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     },
     "output_extras": [
      {}
     ]
    },
    "colab_type": "code",
    "id": "h_SDal0khA3n",
    "outputId": "e6c311e5-c674-4cf2-f2dc-d6ceabfa6f83",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Split data into train and test\n",
    "train_size = int(len(data) * .8)\n",
    "print (\"Train size: %d\" % train_size)\n",
    "print (\"Test size: %d\" % (len(data) - train_size))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "anD38iilhA3r"
   },
   "outputs": [],
   "source": [
    "# Train features\n",
    "description_train = data['description'][:train_size]\n",
    "variety_train = data['variety'][:train_size]\n",
    "points_train = data['points'][:train_size]\n",
    "\n",
    "# Train labels\n",
    "labels_train = data['price'][:train_size]\n",
    "\n",
    "# Test features\n",
    "description_test = data['description'][train_size:]\n",
    "variety_test = data['variety'][train_size:]\n",
    "points_test = data['points'][train_size:]\n",
    "\n",
    "# Test labels\n",
    "labels_test = data['price'][train_size:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "z4GblctFhA3u"
   },
   "outputs": [],
   "source": [
    "# Create a tokenizer to preprocess our text descriptions\n",
    "vocab_size = 100000\n",
    "tokenize = keras.preprocessing.text.Tokenizer(num_words=vocab_size, char_level=False)\n",
    "tokenize.fit_on_texts(description_train) # only fit on train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "YatMLCKXhA3x"
   },
   "outputs": [],
   "source": [
    "# Wide feature 1: sparse bag of words (bow) vocab_size vector \n",
    "description_bow_train = tokenize.texts_to_matrix(description_train)\n",
    "description_bow_test = tokenize.texts_to_matrix(description_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "8quTsErLhA3z"
   },
   "outputs": [],
   "source": [
    "# Wide feature 2: one-hot vector of variety categories\n",
    "\n",
    "# Use sklearn utility to convert label strings to numbered index\n",
    "encoder = LabelEncoder()\n",
    "encoder.fit(variety_train)\n",
    "variety_train = encoder.transform(variety_train)\n",
    "variety_test = encoder.transform(variety_test)\n",
    "num_classes = np.max(variety_train) + 1\n",
    "\n",
    "# Convert labels to one hot\n",
    "variety_train = keras.utils.to_categorical(variety_train, num_classes)\n",
    "variety_test = keras.utils.to_categorical(variety_test, num_classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wide feature 3 - points rating split into buckets\n",
    "# Create buckets column for points value\n",
    "points_train_buckets = []\n",
    "num_points_buckets = 4\n",
    "for i in range(len(points_train)):\n",
    "    bucket_arr = np.zeros(num_points_buckets)\n",
    "    bucket_index = math.ceil((points_train.iloc[i] - 80) / 5) - 1\n",
    "    bucket_arr[bucket_index] = 1\n",
    "    points_train_buckets.append(bucket_arr)\n",
    "\n",
    "\n",
    "points_test_buckets = []\n",
    "for i in range(len(points_test)):\n",
    "    bucket_arr = np.zeros(num_points_buckets)\n",
    "    bucket_index = math.ceil((points_test.iloc[i] - 80) / 5) - 1\n",
    "    bucket_arr[bucket_index] = 1\n",
    "    points_test_buckets.append(bucket_arr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define our wide model with the functional API\n",
    "bow_inputs = layers.Input(shape=(vocab_size,))\n",
    "variety_inputs = layers.Input(shape=(num_classes,))\n",
    "points_inputs = layers.Input(shape=(num_points_buckets,))\n",
    "merged_layer = layers.concatenate([bow_inputs, variety_inputs,points_inputs])\n",
    "merged_layer = layers.Dense(256, activation='relu')(merged_layer)\n",
    "predictions = layers.Dense(1)(merged_layer)\n",
    "wide_model = Model(inputs=[bow_inputs, variety_inputs, points_inputs],outputs=predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "wide_model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])\n",
    "print(wide_model.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Deep model feature 1: word embeddings of wine descriptions\n",
    "train_embed = tokenize.texts_to_sequences(description_train)\n",
    "test_embed = tokenize.texts_to_sequences(description_test)\n",
    "\n",
    "max_seq_length = 170\n",
    "train_embed = keras.preprocessing.sequence.pad_sequences(\n",
    "    train_embed, maxlen=max_seq_length, padding=\"post\")\n",
    "test_embed = keras.preprocessing.sequence.pad_sequences(\n",
    "    test_embed, maxlen=max_seq_length, padding=\"post\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define our deep model with the Functional API\n",
    "deep_inputs = layers.Input(shape=(max_seq_length,))\n",
    "embedding = layers.Embedding(vocab_size, 8, input_length=max_seq_length)(deep_inputs)\n",
    "embedding = layers.Flatten()(embedding)\n",
    "embed_out = layers.Dense(1)(embedding)\n",
    "deep_model = Model(inputs=deep_inputs, outputs=embed_out)\n",
    "print(deep_model.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "deep_model.compile(loss='mse',\n",
    "                       optimizer='adam',\n",
    "                       metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Combine wide and deep into one model\n",
    "merged_out = layers.concatenate([wide_model.output, deep_model.output])\n",
    "merged_out = layers.Dense(1)(merged_out)\n",
    "combined_model = Model(wide_model.input + [deep_model.input], merged_out)\n",
    "print(combined_model.summary())\n",
    "\n",
    "combined_model.compile(loss='mse',\n",
    "                       optimizer='adam',\n",
    "                       metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "gtP-hDRZhA31"
   },
   "outputs": [],
   "source": [
    "# Run training (need to run this for at least 5 epochs to get good accuracy, should probably do this on the cloud)\n",
    "combined_model.fit([description_bow_train, variety_train, np.asarray(points_train_buckets)] + [train_embed], labels_train, epochs=2, batch_size=256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "combined_model.evaluate([description_bow_test, variety_test, np.asarray(points_test_buckets)] + [test_embed], labels_test, batch_size=256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     },
     "output_extras": [
      {}
     ]
    },
    "colab_type": "code",
    "id": "f000lYoxhA4F",
    "outputId": "21cd198f-1979-4b40-a2fd-891a1c0248db"
   },
   "outputs": [],
   "source": [
    "# Generate predictions\n",
    "predictions = combined_model.predict([description_bow_test, variety_test, np.asarray(points_test_buckets)] + [test_embed])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare predictions with actual values for the first few items in our test dataset\n",
    "for i in range(35):\n",
    "    val = predictions[i]\n",
    "    print(description_test[i])\n",
    "    print(val[0], 'Actual: ', labels_test.iloc[i], '\\n')"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "default_view": {},
   "name": "josh3.ipynb",
   "provenance": [],
   "version": "0.3.2",
   "views": {}
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
